import os
import re
from urllib.parse import urlsplit

import fitz  # PyMuPDF
import langdetect
import requests
from tqdm import tqdm

# ----------------------------------------------------------------
# Hard heuristics: enumeration markers that suggest numbered questions.
# Every entry holds a regex `pattern`, a `min_count`, and a `weight`.
# NOTE(review): `min_count` does not appear to be read anywhere in this
# file — confirm whether it is meant to gate the score.
# ----------------------------------------------------------------
HARD_HEURISTICS = {
    # Digits followed by ".)", e.g. "12.)"
    "enum_num_dot_paren": {
        "pattern": r"\b\d+\.\)",
        "min_count": 10,
        "weight": 3,
    },
    # Digits, a dot, then whitespace, e.g. "12. "
    "enum_num_dot_space": {
        "pattern": r"\b\d+\.\s+",
        "min_count": 10,
        "weight": 3,
    },
    # Digits, a closing parenthesis, then whitespace, e.g. "12) "
    "enum_num_paren": {
        "pattern": r"\b\d+\)\s+",
        "min_count": 10,
        "weight": 3,
    },
    # A single letter, a dot, then whitespace, e.g. "B. "
    "enum_letter_dot_space": {
        "pattern": r"\b[A-Z]\.\s+",
        "min_count": 10,
        "weight": 2,
    },
}

# Default run length of consecutive numbers looked for by
# has_sequential_enumeration().
SEQUENTIAL_ENUM_REQUIRED = 10

# ----------------------------------------------------------------
# Soft heuristics: looser hints (blanks, question marks, key phrases).
# Every entry holds a regex `pattern` and a per-match `weight`.
# ----------------------------------------------------------------
SOFT_HEURISTICS = {
    # Three or more underscores in a row
    "fill_in_blanks": {
        "pattern": r"_{3,}",
        "weight": 2,
    },
    # The literal phrase
    "fill_in_the_blank_phrase": {
        "pattern": r"fill in the blank",
        "weight": 3,
    },
    # Any question mark
    "question_marks": {
        "pattern": r"\?",
        "weight": 1,
    },
    # Parenthesised text that ends with "?"
    "parenthesized_question": {
        "pattern": r"\(.*\?\)",
        "weight": 2,
    },
    # Verbs and nouns typical of long-form tasks
    "longform_words": {
        "pattern": r"\b(explain|discuss|compare|analyze|justify|solution)\b",
        "weight": 2,
    },
    # The word "question" itself
    "question_word": {
        "pattern": r"\bquestion\b",
        "weight": 1,
    },
}

# Default minimum combined score a PDF must reach in analyze_pdf().
SOFT_THRESHOLD = 10

# Folder that downloaded PDFs are saved into.
TEMP_DOWNLOAD_FOLDER = "./temp_pdfs"


# **Detect if text is primarily English**
def is_text_english(text, min_confidence=0.9):
    """
    Report whether langdetect considers `text` to be English.

    Returns True when one of the candidates from `langdetect.detect_langs`
    is "en" with a probability of at least `min_confidence`. Returns False
    otherwise, including when detection raises `LangDetectException`.
    """
    try:
        candidates = langdetect.detect_langs(text)
    except langdetect.lang_detect_exception.LangDetectException:
        # Detection failed: treat the text as non-English.
        return False

    return any(
        candidate.lang == "en" and candidate.prob >= min_confidence
        for candidate in candidates
    )


def download_pdf(url, save_folder, timeout=30):
    """
    Download `url` into `save_folder` and return the path of the saved file.

    The file name is the last segment of the URL *path*, so a query string
    or fragment never ends up in the name. `download.pdf` is used when the
    path has no final segment (for example a URL ending in "/").

    Args:
        url: Address of the file to fetch.
        save_folder: Directory to save into; created if it does not exist.
        timeout: Seconds handed to `requests.get` as its `timeout`, so an
            unresponsive server cannot block the caller indefinitely.

    Returns:
        The local file path on success, or None when the request or the
        write to disk fails (the failure is printed, not raised).
    """
    name = os.path.basename(urlsplit(url).path) or "download.pdf"
    filename = os.path.join(save_folder, name)

    try:
        os.makedirs(save_folder, exist_ok=True)

        # The context manager releases the streamed connection even when
        # the download is abandoned half-way.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(filename, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        pdf_file.write(chunk)
    except (requests.RequestException, OSError) as e:
        # OSError covers folder-creation and file-write failures, so one bad
        # URL or path does not abort a whole batch.
        print(f"❌ Failed to download {url}: {e}")
        return None

    return filename


def has_sequential_enumeration(numbers, required_length=SEQUENTIAL_ENUM_REQUIRED):
    """
    Checks if `numbers` contains a **run** of consecutive integers
    (e.g., 1,2,3,...,10) that is at least `required_length` long.

    A run is made of adjacent entries where each one is exactly one greater
    than the entry before it; any other value starts a new run of length 1.

    Args:
        numbers: Sequence of integers in the order they were found.
        required_length: Minimum run length that counts as a hit.

    Returns:
        True as soon as a long-enough run is seen, otherwise False. An empty
        (or None) `numbers` always yields False.
    """
    if not numbers:
        return False

    run_length = 0
    previous = None
    for number in numbers:
        if previous is not None and number == previous + 1:
            run_length += 1
        else:
            run_length = 1

        # Checked on every element, including the first of a run, so that a
        # `required_length` of 1 is satisfied by any single number.
        if run_length >= required_length:
            return True
        previous = number

    return False


def analyze_pdf(pdf_path, chunk_size=10, soft_threshold=SOFT_THRESHOLD):
    """
    Analyzes a PDF and decides whether its **combined score** is high enough.

    The score is the sum of:
    - Hard heuristics (structured enumeration): match count x weight
    - Soft heuristics (question marks, blanks, key phrases): match count x weight
    - A bonus of 5 when the extracted numbers contain a consecutive run

    Language detection rejects the PDF outright when more than 5% of its
    text-bearing pages are not detected as English. Pages without any text
    (blank or image-only) are skipped: they carry no language signal and
    would otherwise be counted as non-English.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        chunk_size: No longer used; accepted so existing callers keep
            working. Pages were previously walked in chunks of this size,
            which visited every page once, in order.
        soft_threshold: Minimum combined score required to pass.

    Returns:
        True when the combined score is at least `soft_threshold`. False
        otherwise, and also when the PDF cannot be opened or is rejected by
        the language check.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"❌ Error opening PDF '{pdf_path}': {e}")
        return False

    # Compile every pattern once instead of once per page.
    # NOTE(review): each hard heuristic also defines `min_count`, which is
    # not consulted here — confirm whether low counts should be discarded.
    hard_patterns = {
        key: re.compile(settings["pattern"], re.IGNORECASE)
        for key, settings in HARD_HEURISTICS.items()
    }
    soft_rules = [
        (re.compile(settings["pattern"], re.IGNORECASE), settings["weight"])
        for settings in SOFT_HEURISTICS.values()
    ]
    enumeration_pattern = re.compile(r"\b(\d+)[\.\)]")

    # Feature tracking
    hard_counts = dict.fromkeys(HARD_HEURISTICS, 0)
    enumeration_numbers = []
    soft_score = 0
    text_pages = 0
    non_english_pages = 0

    try:
        for page_number in range(doc.page_count):
            text = doc[page_number].get_text()

            # Nothing to detect or match on a page without text.
            if not text.strip():
                continue
            text_pages += 1

            if not is_text_english(text, 0.8):
                non_english_pages += 1

            # **Count occurrences of hard heuristics**
            for key, pattern in hard_patterns.items():
                hard_counts[key] += len(pattern.findall(text))

            # **Extract numbers for sequential check**
            for match in enumeration_pattern.finditer(text):
                try:
                    enumeration_numbers.append(int(match.group(1)))
                except ValueError:
                    # Defensive: skip anything int() refuses to convert.
                    continue

            # **Calculate soft heuristic score**
            for pattern, weight in soft_rules:
                soft_score += len(pattern.findall(text)) * weight
    finally:
        # Release the document even if a page fails to process.
        doc.close()

    # **Reject PDFs with too much non-English content**
    if non_english_pages / max(1, text_pages) > 0.05:
        print(f"❌ PDF rejected (not enough English content): {pdf_path}")
        return False

    # **Calculate final heuristic score**
    total_hard_score = sum(
        hard_counts[key] * HARD_HEURISTICS[key]["weight"] for key in HARD_HEURISTICS
    )
    sequential_bonus = 5 if has_sequential_enumeration(enumeration_numbers) else 0
    total_score = total_hard_score + soft_score + sequential_bonus

    print(f"📊 PDF Analysis Summary for {pdf_path}:")
    print(f"   🔹 Hard Score: {total_hard_score}")
    print(f"   🔹 Soft Score: {soft_score:.2f}")
    print(f"   🔹 Sequential Enumeration Bonus: {sequential_bonus}")
    print(f"   🔹 **Final Score**: {total_score}\n")

    return total_score >= soft_threshold


def process_pdfs(urls=None, folder_path=None):
    """
    Processes PDFs from a list of URLs, a local folder, or both.

    Prints the name or URL of PDFs that **pass the heuristic score threshold**.

    Args:
        urls: Iterable of PDF URLs. Each one is downloaded into
            `TEMP_DOWNLOAD_FOLDER` and then analyzed; the downloaded files
            are left in that folder.
        folder_path: Directory whose `*.pdf` files (case-insensitive
            extension, non-recursive) are analyzed in sorted name order.

    Returns:
        None. Results are reported through printed messages only.
    """
    if not urls and not folder_path:
        print("⚠️ No PDFs provided. Provide URLs or a folder path.")
        return

    # **Process PDFs from URLs**
    if urls:
        print("\n🔄 Processing PDFs from URLs...")
        for url in tqdm(urls, desc="Downloading & Analyzing"):
            pdf_path = download_pdf(url, TEMP_DOWNLOAD_FOLDER)
            if pdf_path and analyze_pdf(pdf_path):
                print(f"✅ **Good PDF Found (URL):** {url}")

    # **Process PDFs from local folder**
    if folder_path:
        print("\n🔄 Processing PDFs from Local Folder...")
        # isdir rather than exists: a path that points at a regular file
        # would otherwise reach os.listdir and raise NotADirectoryError.
        if not os.path.isdir(folder_path):
            print(f"⚠️ Folder not found: {folder_path}")
            return

        # Filter first so the progress bar counts PDFs only, and sort so the
        # output order does not depend on the file system.
        pdf_names = sorted(
            name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
        )
        for filename in tqdm(pdf_names, desc="Analyzing Local PDFs"):
            pdf_path = os.path.join(folder_path, filename)
            if analyze_pdf(pdf_path):
                print(f"✅ **Good PDF Found (Local):** {filename}")


# **Main Execution**
if __name__ == "__main__":
    # Local folder to scan, with "~" expanded to the user's home directory.
    local_folder = os.path.expanduser("~/Downloads/0000")

    # Sample PDFs to download and score.
    sample_urls = [
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2010.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2009.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2008.pdf",
        "https://uwaterloo.ca/chemistry/sites/default/files/uploads/documents/avogadro-exam-solution-2007.pdf",
        "https://www.mit.edu/~anugrah/files/MockIChOSolutions.pdf",
        "https://www.mit.edu/~anugrah/files/2012CChOLocalSoln.pdf",
        "https://jeeadv.ac.in/past_qps/2024_1_English.pdf",
        "https://jeeadv.ac.in/past_qps/2024_2_English.pdf",
    ]

    process_pdfs(urls=sample_urls, folder_path=local_folder)
